pandas is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.
It gives you something similar to R's data frames... but better.
In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
pd.__version__
Out[2]:
In [3]:
values = [5,3,4,8,2,9]
vals = pd.Series(values)
vals
Out[3]:
Each value is now associated with an index. The index itself is an object of class Index
and can be manipulated directly.
In [4]:
vals.index
Out[4]:
In [5]:
vals.values
Out[5]:
In [6]:
vals * 2.5
Out[6]:
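NumPy ufuncs also operate element-wise on a Series and preserve the index; a quick sketch, not run here:
# hypothetical: ufuncs apply element-wise and keep the index
np.sqrt(vals)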
We can also supply our own named indexes
In [7]:
vals2 = pd.Series(values, index=['tom','sally','jeff','george','pablo','florence'])
vals2
Out[7]:
And use these to get the data we want
In [8]:
vals2[['florence','tom']]
Out[8]:
In [9]:
vals2[['florence','tom','kate']]
Out[9]:
Dealing with missing values
In [10]:
vals3 = vals2[['tom','sally','pablo','florence','ricky','katrin']]
vals3
Out[10]:
Get rid of them
In [11]:
vals3.dropna()
Out[11]:
Fill them with a value
In [12]:
vals3.fillna(0)
Out[12]:
Fill them with a calculated value
In [13]:
vals3.fillna(vals3.mean())
Out[13]:
Use a fill method like forward fill
In [14]:
vals3.fillna(method='ffill')
Out[14]:
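Backward fill and interpolation work the same way; a quick sketch, not run here:
# hypothetical: other fill strategies on the same Series
vals3.fillna(method='bfill')   # propagate the next valid value backwards
vals3.interpolate()            # linear interpolation between valid values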
A handy way to get a picture of our data
In [15]:
vals3.describe()
Out[15]:
In [16]:
# label the original Series so it lines up with vals3, and reindex
# vals3 with a missing name ('billy') to create a NaN
vals.index = pd.Index(['tom','sally','pablo','florence','ricky','katrin'])
vals3 = vals3[['tom','sally','pablo','florence','billy','katrin']]
In [17]:
# create a dataframe
dat = pd.DataFrame({'orig':vals,'new':vals3})
dat
Out[17]:
Check for nulls
In [18]:
dat.isnull()
Out[18]:
Drop rows with nulls
In [19]:
dat.dropna()
Out[19]:
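dropna can also be made less aggressive; a sketch of two common options:
# hypothetical: drop only rows that are entirely missing,
# or consider just one column when deciding
dat.dropna(how='all')
dat.dropna(subset=['orig'])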
In [20]:
hipster = pd.read_csv('hipster.csv')
hipster[:10]
Out[20]:
Set the index to a datetime
In [21]:
hipster = hipster.set_index(pd.DatetimeIndex(hipster.pop('Date')))
hipster[:10]
Out[21]:
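The same result can be had in one step at read time; a sketch, assuming the date column is named Date:
# hypothetical one-liner: parse dates and set the index while reading
hipster = pd.read_csv('hipster.csv', parse_dates=['Date'], index_col='Date')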
Now load the anti-Hipster data
In [22]:
not_hipster = pd.read_csv('negative-hipster.csv')
not_hipster = not_hipster.set_index(pd.DatetimeIndex(not_hipster.pop('Date')))
In [23]:
not_hipster[:10]
Out[23]:
Check the values of one column
In [24]:
hipster.hipster.head()
Out[24]:
Check another, but get the values as a numpy.ndarray
In [25]:
hipster['gumtree perth'].values[:20]
Out[25]:
View the data types; they don't need to be homogeneous across columns
In [26]:
hipster.dtypes
Out[26]:
Joins on indexes are easy!
In [27]:
trend = hipster.join(not_hipster, how='inner')
trend.head()
Out[27]:
We can check the column names and values
In [28]:
trend.columns
Out[28]:
In [29]:
trend.values
Out[29]:
Filtering on date ranges is simple
In [30]:
trend['2012-01-01':].head()
Out[30]:
In [31]:
trend['2012-01-01': '2013-01-01'].tail(3)
Out[31]:
We can also grab a single date, or a subset of columns
In [32]:
trend.ix['2012-01-01', ['hipster', 'modcloth']]
Out[32]:
Or do some boolean filtering
In [33]:
trend[trend.techno < 0].head()
Out[33]:
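Conditions combine with the element-wise operators & and |, with each condition in parentheses; a small sketch:
# hypothetical combined filter: techno falling while hipster rises
trend[(trend.techno < 0) & (trend.hipster > 0)].head()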
Plotting is built in, and handles dates more gracefully than raw matplotlib
In [34]:
_ = trend.plot(figsize=(10, 6))
_ = plt.legend(loc='best', ncol=2)
We can also do it for a single column
In [35]:
_ = trend.hipster.cumsum().plot()
Or split the columns out to subplots
In [36]:
axs = trend.plot(subplots=True, figsize=(10, 10))
Resampling data is also straightforward.
In [37]:
# resample by month
trend.resample('M', how='mean').head()
Out[37]:
And here by year; one can also resample by business day, week, quarter, and a bunch of other frequencies
In [38]:
# resample by year
_ = trend.resample('A', how='mean').plot(figsize=(10, 10))
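The other frequencies use the same offset-alias strings; a sketch, not run here:
# hypothetical resampling at other frequencies
trend.resample('W', how='mean').head()   # weekly
trend.resample('Q', how='mean').head()   # quarterly
trend.resample('B', how='mean').head()   # business day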
Other fancy plots include a scatter matrix, with kernel density estimates (KDE) on the diagonal
In [39]:
# look at the relations
_ = pd.scatter_matrix(trend, figsize=(12,8), diagonal='kde')
In [40]:
df = pd.read_csv('train.csv', header=0)
In [41]:
df.head()
Out[41]:
Let's look at the data types here (this time they're heterogeneous)
In [42]:
df.dtypes
Out[42]:
We can also get a more verbose summary
In [43]:
df.info()
DataFrames can be grouped, like in SQL (it sucked to be a young male on the Titanic)
In [44]:
df_grouped = df.groupby(['Pclass', 'Sex'])
In [45]:
df_grouped[['Age', 'Survived']].mean()
Out[45]:
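A grouped object can also compute several statistics at once; a minimal sketch using agg:
# hypothetical multi-statistic aggregation over the same groups
df_grouped['Survived'].agg(['mean', 'count'])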
Histograms are straightforward
In [46]:
ax = df['Age'].dropna().hist(bins=20, range=(0,100), alpha = .5)
ax.set_xlabel('Age')
ax.set_ylabel('Passenger Count')
Out[46]:
So are boxplots
In [47]:
bp = df.boxplot(column='Age', by='Pclass', grid=False)
for i in set(df.Pclass):
    y = df.Age[df.Pclass == i].dropna()
    # Add some random "jitter" to the x-axis
    x = np.random.normal(i, 0.04, size=len(y))
    plt.plot(x, y, 'r.', alpha=0.2)
If we want to do some learning on this data, let's convert gender to a binary numeric column
In [48]:
df['isFemale'] = df['Sex'].map( {'female': 1, 'male': 0} ).astype(int)
df[['Sex','isFemale']].head()
Out[48]:
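The same encoding can be written as a vectorized comparison; a one-line sketch:
# hypothetical equivalent using a boolean comparison
df['isFemale'] = (df.Sex == 'female').astype(int)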
Find non-numeric columns so we can drop them later
In [49]:
drop_cols = df.columns[df.dtypes.map(lambda x: x=='object')]
drop_cols
Out[49]:
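Newer pandas versions can do this selection directly; a sketch, assuming pandas 0.14.1+ with select_dtypes:
# hypothetical equivalent using select_dtypes
df.select_dtypes(include=['object']).columns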
In [50]:
df.info()
Set up our data to learn from
In [51]:
X = pd.DataFrame(df[[c for c in df.columns if c != 'Survived']])
X = X.drop(drop_cols, axis=1)
X = X.drop('PassengerId', axis=1)
y = df.Survived
print X.head()
Have a quick look at the class distribution
In [52]:
y.groupby(y.values).count()
Out[52]:
And fill in the NaNs for age
In [53]:
X['Age'] = X.Age.fillna(X.Age.median())
Prediction with scikit-learn is easy - who will survive?
In [54]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score as acc
In [55]:
# create our classifier
clf = LogisticRegression()
# fit it to the data
clf.fit(X, y)
# and predict
preds = clf.predict(X)
res_acc = acc(y, preds)
print 'Accuracy Score: {:.2f}'.format(res_acc)
print 'Not too bad'
In [56]:
from sklearn.cross_validation import KFold
In [57]:
cv = KFold(n=len(y), n_folds=5, shuffle=True)
preds = np.zeros_like(y)
for train, test in cv:
    clf = LogisticRegression()
    clf.fit(X.ix[train], y.ix[train])
    preds[test] = clf.predict(X.ix[test])
res_acc = acc(y, preds)
print 'Accuracy Score: {:.2f}'.format(res_acc)
And cross-validation can be done more easily
In [58]:
# scikits can actually take care of this for us
from sklearn.cross_validation import cross_val_score
# here
clf = LogisticRegression()
scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
# to here
print scores
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
In [59]:
df.Embarked.head()
Out[59]:
In [60]:
set(df.Embarked.fillna('O'))
Out[60]:
Use the LabelEncoder
In [61]:
from sklearn import preprocessing
df.Embarked = df.Embarked.fillna('O')
le = preprocessing.LabelEncoder()
le.fit(df.Embarked.values)
le.classes_
Out[61]:
In [62]:
X['Embarked'] = le.transform(df.Embarked.values)
X.Embarked.head()
Out[62]:
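For a nominal feature like Embarked, one-hot encoding avoids imposing an artificial order on the labels; a sketch using pandas' get_dummies:
# hypothetical alternative: one column of 0/1 indicators per port
pd.get_dummies(df.Embarked, prefix='Embarked').head()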
In [63]:
for C in [0.001, 0.01, 0.1, 1, 10, 100]:
    clf = LogisticRegression(C=C, penalty='l1')
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print("C: {:3.3f}\tAccuracy: {:.2f} (+/- {:.2f})"
          .format(C, scores.mean(), scores.std() * 2))
In [64]:
# normalise the data
from sklearn.preprocessing import StandardScaler
X = StandardScaler().fit_transform(X)
In [65]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.lda import LDA
from sklearn.qda import QDA
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
"Random Forest", "AdaBoost", "Naive Bayes", "LDA",
"QDA", "Logistic Regression"]
classifiers = [
KNeighborsClassifier(3),
SVC(kernel="linear", C=0.025),
SVC(gamma=2, C=1),
DecisionTreeClassifier(),
RandomForestClassifier(),
AdaBoostClassifier(),
GaussianNB(),
LDA(),
QDA(),
LogisticRegression(class_weight='auto')]
In [66]:
# fit each classifier and find the mean performance
res = []
for name, clf in zip(names, classifiers):
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    res.append(scores.mean())
In [67]:
import prettyplotlib as ppl
res = np.array(res)
names = np.array(names)
idx = np.argsort(res)[::-1]
fig, ax = plt.subplots(1, figsize=(14, 6))
ppl.bar(ax, np.arange(len(res)), res[idx], annotate=True,
        xticklabels=names[idx], grid='y')
plt.xticks(rotation=30)
_ = ax.set_ylim(res.min() * 0.95, res.max() * 1.05)
Models can be pickled
In [69]:
# models can be saved
import pickle
s = pickle.dumps(clf)
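The pickled bytes can be restored later and used for prediction; a minimal sketch (scikit-learn's docs also suggest joblib for models holding large arrays):
# hypothetical round trip: restore the classifier and reuse it
clf2 = pickle.loads(s)       # assumes clf had been fit before pickling
preds = clf2.predict(X)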
by Andreas Mueller